package cn.edu.bjtu.model.dl4jword2vec;
import org.datavec.api.util.ClassPathResource;
import org.deeplearning4j.models.embeddings.WeightLookupTable;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.edu.bjtu.abstractimpl.analyzer.AnsjDocumentAnalyzer;
import cn.edu.bjtu.classimpl.parser.DataBaseWechatParser;
import cn.edu.bjtu.classimpl.parser.LineParser;
import cn.edu.bjtu.general.math.Arrays;
import cn.edu.bjtu.interfaces.document.IDocument;
import cn.edu.bjtu.interfaces.parser.Parser;
import cn.edu.bjtu.interfaces.segment.DocumentSegmentation;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Collection;

/**
 * A simple example of updating model weights after the initial vocabulary build.
 * If you have built your w2v model and some time later decide that it should be
 * additionally trained over a new corpus, here's an example of how to do it.
 *
 * PLEASE NOTE: At this moment, no new words will be added to the vocabulary/model.
 * Only the weight-update process will be issued. This is often called "frozen vocab training".
 *
 * @author raver119@gmail.com
 * 2017-04-28  deeplearning4j supports 64-bit JVMs only
 */
public class Word2VecUptrainingExample {

    private static final Logger log = LoggerFactory.getLogger(Word2VecUptrainingExample.class);
    private static final DL4jWordVecModel dl4jWordVecModel = DL4jWordVecModel.getInstance();

    /** Default training-corpus path used when no CLI argument is supplied. */
    private static final String DEFAULT_TRAIN_FILE = "H:\\wechat\\fdData.txt";
    /** Default model-output path used when no CLI argument is supplied. */
    private static final String DEFAULT_MODEL_FILE = "F:\\worksspace\\word2vecfile\\model\\fdModel.txt";

    /**
     * Entry point. Pre-processes (word-segments) the training corpus so it can
     * later be fed to a Word2Vec model.
     *
     * @param args optional CLI arguments: args[0] = training-corpus path,
     *             args[1] = model output path. Missing arguments fall back to
     *             the hard-coded local defaults above.
     * @throws Exception propagated from the corpus pre-processing step
     */
    public static void main(String[] args) throws Exception {
        /*
                Initial model training phase
         */
        // BUG FIX: the original only checked args.length == 0, so invoking the
        // program with exactly one argument crashed with
        // ArrayIndexOutOfBoundsException on args[1]. Each argument now falls
        // back to its default independently (backward-compatible for 0 or 2+ args).
        String filePath = (args != null && args.length > 0) ? args[0] : DEFAULT_TRAIN_FILE;
        String dest = (args != null && args.length > 1) ? args[1] : DEFAULT_MODEL_FILE;
        log.info("Training corpus: {}, model destination: {}", filePath, dest);

        // First, word-segment the raw corpus (pre-processing of the training file).
        // Chinese text has no whitespace word boundaries, so explicit segmentation
        // is required before tokenization.
        dl4jWordVecModel.processTrainFile(filePath);

        /* ------------------------------------------------------------------
         * Reference code kept from the original example (intentionally not
         * executed). It shows the full build/train/save/restore cycle:
         *
         * log.info("Load & Vectorize Sentences....");
         * // Strip white space before and after each line
         * SentenceIterator iter = new BasicLineIterator("F:\\worksspace\\word2vecfile\\fdData\\trainfile\\split.txt");
         * // Split on white spaces in the line to get words
         * TokenizerFactory t = new DefaultTokenizerFactory();
         * t.setTokenPreProcessor(new CommonPreprocessor());
         *
         * // Manual creation of VocabCache and WeightLookupTable usually isn't
         * // necessary, but uptraining needs direct access to them.
         * InMemoryLookupCache cache = new InMemoryLookupCache();
         * WeightLookupTable<VocabWord> table = new InMemoryLookupTable.Builder<VocabWord>()
         *         .vectorLength(100)
         *         .useAdaGrad(false)
         *         .cache(cache)
         *         .lr(0.025f).build();
         *
         * log.info("Building model....");
         * Word2Vec vec = new Word2Vec.Builder()
         *         .minWordFrequency(5)
         *         .iterations(1)
         *         .epochs(1)
         *         .layerSize(100)
         *         .seed(42)
         *         .windowSize(5)
         *         .iterate(iter)
         *         .tokenizerFactory(t)
         *         .lookupTable(table)
         *         .vocabCache(cache)
         *         .build();
         *
         * log.info("Fitting Word2Vec model....");
         * vec.fit();
         *
         * // Saving options:
         * //   WordVectorSerializer.writeFullModel(vec, dest);      // full model
         * //   WordVectorSerializer.writeWord2VecModel(vec, dest);  // binary format
         * //   WordVectorSerializer.writeWordVectors(vec, dest);    // text: word vec1 vec2 ...
         *
         * // Restoring and uptraining over a new corpus ("frozen vocab training"):
         * // PLEASE NOTE: after the model is restored, it is still required to set
         * // a SentenceIterator and TokenizerFactory before further training.
         * Word2Vec word2Vec = WordVectorSerializer.readWord2VecModel(dest);
         * SentenceIterator iterator = new BasicLineIterator(filePath);
         * TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
         * tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());
         * word2Vec.setTokenizerFactory(tokenizerFactory);
         * word2Vec.setSentenceIterator(iterator);
         *
         * log.info("Word2vec uptraining...");
         * word2Vec.fit();
         *
         * Collection<String> lst = word2Vec.wordsNearest("day", 10);
         * log.info("Closest words to 'day' on 2nd run: " + lst);
         * ------------------------------------------------------------------ */
    }
}